Import Statements¶

In [ ]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from matplotlib.ticker import FuncFormatter
import warnings

warnings.simplefilter("ignore")

Notebook Presentation¶

In [ ]:
%%HTML
<script src="require.js"></script>
In [ ]:
# 設定pandas顯示格式
pd.options.display.float_format = '{:,.2f}'.format
# 設定plotly呈現
import plotly.io as pio
pio.renderers.default = 'notebook'
In [ ]:
pd.set_option("display.max_columns", 35)
pd.set_option("display.max_rows", 50)

Load Data¶

In [ ]:
df = pd.read_csv("marketing_campaign.csv",sep='\t')

Preliminary Data Exploration¶

In [ ]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   object 
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  
 15  NumDealsPurchases    2240 non-null   int64  
 16  NumWebPurchases      2240 non-null   int64  
 17  NumCatalogPurchases  2240 non-null   int64  
 18  NumStorePurchases    2240 non-null   int64  
 19  NumWebVisitsMonth    2240 non-null   int64  
 20  AcceptedCmp3         2240 non-null   int64  
 21  AcceptedCmp4         2240 non-null   int64  
 22  AcceptedCmp5         2240 non-null   int64  
 23  AcceptedCmp1         2240 non-null   int64  
 24  AcceptedCmp2         2240 non-null   int64  
 25  Complain             2240 non-null   int64  
 26  Z_CostContact        2240 non-null   int64  
 27  Z_Revenue            2240 non-null   int64  
 28  Response             2240 non-null   int64  
dtypes: float64(1), int64(25), object(3)
memory usage: 507.6+ KB
In [ ]:
df.head()
Out[ ]:
ID Year_Birth Education Marital_Status Income Kidhome Teenhome Dt_Customer Recency MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts MntGoldProds NumDealsPurchases NumWebPurchases NumCatalogPurchases NumStorePurchases NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2 Complain Z_CostContact Z_Revenue Response
0 5524 1957 Graduation Single 58,138.00 0 0 04-09-2012 58 635 88 546 172 88 88 3 8 10 4 7 0 0 0 0 0 0 3 11 1
1 2174 1954 Graduation Single 46,344.00 1 1 08-03-2014 38 11 1 6 2 1 6 2 1 1 2 5 0 0 0 0 0 0 3 11 0
2 4141 1965 Graduation Together 71,613.00 0 0 21-08-2013 26 426 49 127 111 21 42 1 8 2 10 4 0 0 0 0 0 0 3 11 0
3 6182 1984 Graduation Together 26,646.00 1 0 10-02-2014 26 11 4 20 10 3 5 2 2 0 4 6 0 0 0 0 0 0 3 11 0
4 5324 1981 PhD Married 58,293.00 1 0 19-01-2014 94 173 43 118 46 27 15 5 5 3 6 5 0 0 0 0 0 0 3 11 0
In [ ]:
df.describe()
Out[ ]:
ID Year_Birth Income Kidhome Teenhome Recency MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts MntGoldProds NumDealsPurchases NumWebPurchases NumCatalogPurchases NumStorePurchases NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2 Complain Z_CostContact Z_Revenue Response
count 2,240.00 2,240.00 2,216.00 2,240.00 2,240.00 2,240.00 2,240.00 2,240.00 2,240.00 2,240.00 2,240.00 2,240.00 2,240.00 2,240.00 2,240.00 2,240.00 2,240.00 2,240.00 2,240.00 2,240.00 2,240.00 2,240.00 2,240.00 2,240.00 2,240.00 2,240.00
mean 5,592.16 1,968.81 52,247.25 0.44 0.51 49.11 303.94 26.30 166.95 37.53 27.06 44.02 2.33 4.08 2.66 5.79 5.32 0.07 0.07 0.07 0.06 0.01 0.01 3.00 11.00 0.15
std 3,246.66 11.98 25,173.08 0.54 0.54 28.96 336.60 39.77 225.72 54.63 41.28 52.17 1.93 2.78 2.92 3.25 2.43 0.26 0.26 0.26 0.25 0.11 0.10 0.00 0.00 0.36
min 0.00 1,893.00 1,730.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 3.00 11.00 0.00
25% 2,828.25 1,959.00 35,303.00 0.00 0.00 24.00 23.75 1.00 16.00 3.00 1.00 9.00 1.00 2.00 0.00 3.00 3.00 0.00 0.00 0.00 0.00 0.00 0.00 3.00 11.00 0.00
50% 5,458.50 1,970.00 51,381.50 0.00 0.00 49.00 173.50 8.00 67.00 12.00 8.00 24.00 2.00 4.00 2.00 5.00 6.00 0.00 0.00 0.00 0.00 0.00 0.00 3.00 11.00 0.00
75% 8,427.75 1,977.00 68,522.00 1.00 1.00 74.00 504.25 33.00 232.00 50.00 33.00 56.00 3.00 6.00 4.00 8.00 7.00 0.00 0.00 0.00 0.00 0.00 0.00 3.00 11.00 0.00
max 11,191.00 1,996.00 666,666.00 2.00 2.00 99.00 1,493.00 199.00 1,725.00 259.00 263.00 362.00 15.00 27.00 28.00 13.00 20.00 1.00 1.00 1.00 1.00 1.00 1.00 3.00 11.00 1.00

Data Cleaning - Check for Missing Values and Duplicates¶

In [ ]:
df.isna().sum()
Out[ ]:
ID                      0
Year_Birth              0
Education               0
Marital_Status          0
Income                 24
Kidhome                 0
Teenhome                0
Dt_Customer             0
Recency                 0
MntWines                0
MntFruits               0
MntMeatProducts         0
MntFishProducts         0
MntSweetProducts        0
MntGoldProds            0
NumDealsPurchases       0
NumWebPurchases         0
NumCatalogPurchases     0
NumStorePurchases       0
NumWebVisitsMonth       0
AcceptedCmp3            0
AcceptedCmp4            0
AcceptedCmp5            0
AcceptedCmp1            0
AcceptedCmp2            0
Complain                0
Z_CostContact           0
Z_Revenue               0
Response                0
dtype: int64
In [ ]:
# I want to keep the rows with missing values to do visualization, so I don't drop them here.
df[df.Income.isna()].head()
Out[ ]:
ID Year_Birth Education Marital_Status Income Kidhome Teenhome Dt_Customer Recency MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts MntGoldProds NumDealsPurchases NumWebPurchases NumCatalogPurchases NumStorePurchases NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2 Complain Z_CostContact Z_Revenue Response
10 1994 1983 Graduation Married NaN 1 0 15-11-2013 11 5 5 6 0 2 1 1 1 0 2 7 0 0 0 0 0 0 3 11 0
27 5255 1986 Graduation Single NaN 1 0 20-02-2013 19 5 1 3 3 263 362 0 27 0 0 1 0 0 0 0 0 0 3 11 0
43 7281 1959 PhD Single NaN 0 0 05-11-2013 80 81 11 50 3 2 39 1 1 3 4 2 0 0 0 0 0 0 3 11 0
48 7244 1951 Graduation Single NaN 2 1 01-01-2014 96 48 5 48 6 10 7 3 2 1 4 6 0 0 0 0 0 0 3 11 0
58 8557 1982 Graduation Single NaN 1 0 17-06-2013 57 11 3 22 2 2 6 2 2 0 3 6 0 0 0 0 0 0 3 11 0
In [ ]:
df.duplicated().any()
Out[ ]:
False

Data Preprocessing¶

In [ ]:
# This dataset didn't tell us when it was created.
# Thus, I find out the latest date of customer's enrollment with the company and take it as the date of this data to do more calculation.
df["Dt_Customer"] = pd.to_datetime(df["Dt_Customer"])
df["Dt_Customer"].max()
Out[ ]:
Timestamp('2014-12-06 00:00:00')
In [ ]:
# age
df["Age"] = 2014 - df["Year_Birth"]
In [ ]:
# membership_days
df["membership_days"] = df["Dt_Customer"].max() - df["Dt_Customer"]
df["membership_days"] = df["membership_days"].dt.days
In [ ]:
df["Frequency"] = df[["NumWebPurchases","NumCatalogPurchases","NumStorePurchases"]].sum(axis=1)
df["Monetary"] = df.loc[:,"MntWines":"MntGoldProds"].sum(axis=1)

RFM Analysis¶

R(Recency), F(Frequency), M(Monetary)

Processing RFM values¶

In [ ]:
# When the recency get higher, it means the customer didn't come to buy for a longer period of time.
# In order to unify the direction of values of RFM, I encode the lower recency values to 5, which represents those customers come to buy more nearly. 
rfm_df = df.copy()
rfm_df["R"] = pd.qcut(rfm_df["Recency"],5,labels=[i for i in range(5,0,-1)])
display(rfm_df["R"])
display(rfm_df[["R","Recency"]])
0       3
1       4
2       4
3       4
4       1
       ..
2235    3
2236    3
2237    1
2238    5
2239    3
Name: R, Length: 2240, dtype: category
Categories (5, int64): [5 < 4 < 3 < 2 < 1]
R Recency
0 3 58
1 4 38
2 4 26
3 4 26
4 1 94
... ... ...
2235 3 46
2236 3 56
2237 1 91
2238 5 8
2239 3 40

2240 rows × 2 columns

In [ ]:
rfm_df["F"] = pd.qcut(rfm_df["Frequency"],5,labels=range(1,6))
rfm_df["F"]
Out[ ]:
0       5
1       1
2       4
3       2
4       3
       ..
2235    4
2236    3
2237    4
2238    5
2239    2
Name: F, Length: 2240, dtype: category
Categories (5, int64): [1 < 2 < 3 < 4 < 5]
In [ ]:
rfm_df["M"] = pd.qcut(rfm_df["Monetary"],5,labels=range(1,6))
rfm_df["M"]
Out[ ]:
0       5
1       1
2       4
3       1
4       3
       ..
2235    5
2236    3
2237    5
2238    4
2239    2
Name: M, Length: 2240, dtype: category
Categories (5, int64): [1 < 2 < 3 < 4 < 5]
In [ ]:
# calculate rfm value
rfm_df["RFM"] = rfm_df[["R","F","M"]].sum(axis=1) / 3
rfm_df["RFM"]
Out[ ]:
0      4.33
1      2.00
2      4.00
3      2.33
4      2.33
       ... 
2235   4.00
2236   3.00
2237   3.33
2238   4.67
2239   2.33
Name: RFM, Length: 2240, dtype: float64
In [ ]:
# divide customers into 5 groups by rfm value
rfm_df["rfm_g"] = pd.qcut(rfm_df["RFM"],5,labels=range(1,6))
rfm_df["rfm_g"]
Out[ ]:
0       5
1       1
2       4
3       2
4       2
       ..
2235    4
2236    3
2237    3
2238    5
2239    2
Name: rfm_g, Length: 2240, dtype: category
Categories (5, int64): [1 < 2 < 3 < 4 < 5]
In [ ]:
# after processing
rfm_df.head()
Out[ ]:
ID Year_Birth Education Marital_Status Income Kidhome Teenhome Dt_Customer Recency MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts MntGoldProds NumDealsPurchases NumWebPurchases ... AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2 Complain Z_CostContact Z_Revenue Response Age membership_days Frequency Monetary R F M RFM rfm_g
0 5524 1957 Graduation Single 58,138.00 0 0 2012-04-09 58 635 88 546 172 88 88 3 8 ... 0 0 0 0 0 3 11 1 57 971 22 1617 3 5 5 4.33 5
1 2174 1954 Graduation Single 46,344.00 1 1 2014-08-03 38 11 1 6 2 1 6 2 1 ... 0 0 0 0 0 3 11 0 60 125 4 27 4 1 1 2.00 1
2 4141 1965 Graduation Together 71,613.00 0 0 2013-08-21 26 426 49 127 111 21 42 1 8 ... 0 0 0 0 0 3 11 0 49 472 20 776 4 4 4 4.00 4
3 6182 1984 Graduation Together 26,646.00 1 0 2014-10-02 26 11 4 20 10 3 5 2 2 ... 0 0 0 0 0 3 11 0 30 65 6 53 4 2 1 2.33 2
4 5324 1981 PhD Married 58,293.00 1 0 2014-01-19 94 173 43 118 46 27 15 5 5 ... 0 0 0 0 0 3 11 0 33 321 14 422 1 3 3 2.33 2

5 rows × 38 columns

Visualization¶

Total consumption by group¶

The fifth and forth groups accounts 67.8 % of consumption.

In [ ]:
color_discrete_map = {1: px.colors.qualitative.Set1[1], 2: px.colors.qualitative.Set1[2],
                      3: px.colors.qualitative.Set1[5], 4: px.colors.qualitative.Set1[4], 5: px.colors.qualitative.Set1[0],}
fig = px.pie(rfm_df, names="rfm_g", values="Monetary", hole=0.5,
             title="What percentage of total consumption does each group account for?",color="rfm_g",color_discrete_map=color_discrete_map)
fig.update_traces(textposition='outside', textinfo='percent+label',textfont_size=15)
fig.show()

Frequency¶

In [ ]:
color_discrete_map = {1: px.colors.qualitative.Set1[1], 2: px.colors.qualitative.Set1[2],
                      3: px.colors.qualitative.Set1[5], 4: px.colors.qualitative.Set1[4], 5: px.colors.qualitative.Set1[0],}
fig = px.box(rfm_df, x="rfm_g", y="Frequency", color="rfm_g",
             title="How often does each group of customers purchase?", color_discrete_map=color_discrete_map)
fig.show()

Recency¶

In [ ]:
fig = px.box(rfm_df, x="rfm_g", y="Recency", color="rfm_g",
             title="Number of days since customer's last purchase by group",
             color_discrete_map=color_discrete_map)
fig.show()

Purchase made with a discount¶

In [ ]:
fig = px.box(rfm_df, x="rfm_g", y="NumDealsPurchases", color="rfm_g",
             title="Number of purchases made with a discount",
             color_discrete_map=color_discrete_map)
fig.show()

Bubble chart of RFM¶

  • RFM Model Reference

rfm model
source: https://www.putler.com/rfm-segmentation/

In [ ]:
fig = px.scatter(rfm_df, x="Frequency", y="Recency", size="Monetary",
                 size_max=50, color=rfm_df.RFM, opacity=0.4,
                 hover_name="ID",
                 color_continuous_scale=px.colors.diverging.Portland,
                 title="RFM Analysis<br>*the size of bubbles represents the monetary",)
fig.update_layout(yaxis=dict(autorange="reversed"))
fig.show()

Income¶

In [ ]:
sns_palette = {1: sns.color_palette("Set1")[1], 2: sns.color_palette("Set1")[2],
                      3: sns.color_palette("Set1")[5], 4: sns.color_palette("Set1")[4], 5: sns.color_palette("Set1")[0],}
print("The outliers make the plot hard to read.")
plt.figure(figsize=(12,4))
plt.title("Income Distribution by groups of RFM")
sns.histplot(rfm_df, x="Income",hue="rfm_g",element="poly",palette=sns_palette)
plt.show()

print("Set the maximum of x-axis limit to be 175000")
plt.figure(figsize=(12,4))
plt.title("Income Distribution by groups of RFM")
sns.histplot(rfm_df, x="Income",hue="rfm_g",element="poly",palette=sns_palette)
plt.xlim(0,175000)
plt.show()
The outliers make the plot hard to read.
Set the maximum of x-axis limit to be 175000

Age¶

In [ ]:
fig = px.box(rfm_df, x="rfm_g", y="Age", color="rfm_g",
             title="Age Distribution by groups of RFM",
             color_discrete_map=color_discrete_map)
fig.show()

Marital Status¶

In [ ]:
marital_status = rfm_df.groupby(["rfm_g", "Marital_Status"]).agg(Count=("ID", "count")).pivot_table(index=["rfm_g"],columns="Marital_Status",values="Count")
marital_status = marital_status.apply(lambda x: x.div(x.sum()), axis=1) # calculate percentage

plt.title("Marital status by group")
ax = sns.heatmap(marital_status.transpose(),cmap="RdPu",cbar=False)
fmt = lambda x, pos: '{:.0%}'.format(x)
cbar = ax.figure.colorbar(ax.collections[0],format=FuncFormatter(fmt))
cbar.outline.set_visible(False)

plt.show()

Family Status¶

In [ ]:
home = rfm_df[["rfm_g","Teenhome","Kidhome"]].groupby("rfm_g").mean()
# plt.figure(figsize=(6,8))
plt.title("Average number of children by group")
sns.heatmap(home.transpose(),cmap="RdPu",annot=True)
plt.yticks(rotation=0) 
plt.show()

Product Category¶

In [ ]:
buy = ['MntWines', 'MntFruits','MntMeatProducts', 'MntFishProducts', 'MntSweetProducts','MntGoldProds']
g_contribution = rfm_df.groupby("rfm_g")[buy].mean().reset_index()
fig = px.bar(g_contribution, x=buy, y="rfm_g",
             orientation="h",
             title="Contribution to different categories by group",
             )
fig.update_layout(yaxis={'categoryorder': 'total ascending'},xaxis_title="Average amount")
fig.show()
In [ ]:
buy_pct = rfm_df[buy].apply(lambda x: x.div(x.sum()), axis=1)
buy_pct = buy_pct.join(rfm_df["rfm_g"])
buy_pct = buy_pct.groupby("rfm_g")[buy].mean()
buy_pct
Out[ ]:
MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts MntGoldProds
rfm_g
1 0.36 0.06 0.25 0.09 0.06 0.18
2 0.44 0.05 0.23 0.07 0.06 0.15
3 0.50 0.04 0.24 0.06 0.05 0.11
4 0.53 0.04 0.26 0.06 0.04 0.07
5 0.50 0.05 0.28 0.06 0.05 0.06
In [ ]:
plt.title("Which category of products accounts for consumption of different groups the most?")
ax = sns.heatmap(buy_pct.transpose(),cmap="RdPu",cbar=False)
fmt = lambda x, pos: '{:.0%}'.format(x)
cbar = ax.figure.colorbar(ax.collections[0],format=FuncFormatter(fmt))
cbar.outline.set_visible(False)
plt.show()

Sales Channel¶

In [ ]:
channel = ['NumWebPurchases','NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth']
# channel_pct = rfm_df[channel].apply(lambda x: x.div(x.sum()), axis=1)
# channel_pct = channel_pct.join(rfm_df["rfm_g"])
channel_pct = rfm_df.groupby("rfm_g")[channel].mean()
channel_pct
Out[ ]:
NumWebPurchases NumCatalogPurchases NumStorePurchases NumWebVisitsMonth
rfm_g
1 1.61 0.39 2.86 6.51
2 3.08 0.94 4.04 6.19
3 4.79 2.89 6.24 5.06
4 5.73 4.64 8.19 4.29
5 6.40 5.81 9.20 3.88
In [ ]:
plt.title("Purchase and visit by group")
sns.heatmap(channel_pct.transpose(),cmap="RdPu",annot=True)
plt.show()

Campaign¶

In [ ]:
cmps = ["AcceptedCmp1","AcceptedCmp2","AcceptedCmp3","AcceptedCmp4","AcceptedCmp5","Response",]
cmp_attend = rfm_df.groupby("rfm_g")[cmps].sum().transpose()
plt.title("Campaign acceptance by group")
sns.heatmap(cmp_attend,cmap="RdPu",annot=True,fmt='.3g')
plt.show()

Predict Monetary¶

Data Preprocessing¶

In [ ]:
for_train = df.dropna()
# label encoding
edu_dict = {'Basic': 1,
            'Graduation': 2,
            'Master': 3,
            '2n Cycle': 3,
            "PhD": 4}
for_train['Education'] = for_train['Education'].map(edu_dict)

# Frequency encoding
ms_dict = dict(df["Marital_Status"].value_counts())
display(ms_dict)
for_train["Marital_Status"] = for_train["Marital_Status"].map(ms_dict)
{'Married': 864,
 'Together': 580,
 'Single': 480,
 'Divorced': 232,
 'Widow': 77,
 'Alone': 3,
 'Absurd': 2,
 'YOLO': 2}
In [ ]:
# I've transformed the first two columns to other columns, so I can drop them.
# As for the last two columns, the owner of this data didn't explain what are they, so I also drop them.
for_train.drop(['Year_Birth','Dt_Customer','Z_CostContact', 'Z_Revenue'],axis=1,inplace=True)

# The value of monetary is from these columns, so I need to drop them, too.
for_train.drop(for_train.loc[:,"MntWines":"MntGoldProds"],axis=1,inplace=True)
for_train.head()
Out[ ]:
ID Education Marital_Status Income Kidhome Teenhome Recency NumDealsPurchases NumWebPurchases NumCatalogPurchases NumStorePurchases NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2 Complain Response Age membership_days Frequency Monetary
0 5524 2 480 58,138.00 0 0 58 3 8 10 4 7 0 0 0 0 0 0 1 57 971 22 1617
1 2174 2 480 46,344.00 1 1 38 2 1 1 2 5 0 0 0 0 0 0 0 60 125 4 27
2 4141 2 580 71,613.00 0 0 26 1 8 2 10 4 0 0 0 0 0 0 0 49 472 20 776
3 6182 2 580 26,646.00 1 0 26 2 2 0 4 6 0 0 0 0 0 0 0 30 65 6 53
4 5324 4 864 58,293.00 1 0 94 5 5 3 6 5 0 0 0 0 0 0 0 33 321 14 422
In [ ]:
corr = for_train[for_train.columns.difference(['ID'])].corr()
plt.figure(figsize=(12,10),dpi=200)
plt.title("Correlation Heatmap")
sns.heatmap(corr, square=True, cmap="RdBu_r",center=0)
Out[ ]:
<AxesSubplot: title={'center': 'Correlation Heatmap'}>
In [ ]:
corr = for_train[for_train.columns.difference(['ID'])].corr()
plt.figure(figsize=(3,9),dpi=200)
plt.title("Correlation Heatmap of Monetary")
sns.heatmap(corr[['Monetary']].sort_values(by='Monetary', ascending=False), cmap="RdBu_r",center=0,annot=True)
Out[ ]:
<AxesSubplot: title={'center': 'Correlation Heatmap of Monetary'}>

Multivariable Regression¶

In [ ]:
train_dt = for_train[for_train.columns.difference(['ID',"Monetary"])]
target = for_train.Monetary
X_train, X_test, y_train, y_test = train_test_split(train_dt, target, random_state=10, test_size=0.2)
In [ ]:
train_model = LinearRegression()
train_model.fit(X_train, y_train)
predictions = train_model.predict(X_test)
train_r2 = train_model.score(X_train, y_train)
print(f'Training data r-squared: {train_r2:.2}')
Training data r-squared: 0.81
In [ ]:
test_model = LinearRegression()
test_model.fit(X_test, y_test)
predictions = test_model.predict(X_test)
test_r2 = test_model.score(X_test, y_test)
print(f'Test data r-squared: {test_r2:.2}')
Test data r-squared: 0.83
In [ ]:
regr_coef = pd.DataFrame(train_model.coef_,index=train_dt.columns,columns=["coefficients"])
regr_coef
Out[ ]:
coefficients
AcceptedCmp1 137.16
AcceptedCmp2 -165.55
AcceptedCmp3 12.75
AcceptedCmp4 82.22
AcceptedCmp5 335.09
Age 0.32
Complain -129.08
Education -3.13
Frequency 39.22
Income 0.00
Kidhome -64.51
Marital_Status -0.01
NumCatalogPurchases 39.20
NumDealsPurchases -11.67
NumStorePurchases 2.26
NumWebPurchases -2.24
NumWebVisitsMonth -8.32
Recency 0.46
Response 44.93
Teenhome -89.26
membership_days 0.19

Evaluate the model¶

In [ ]:
preds = train_model.predict(X_test)
plt.figure(dpi=200)
plt.title('Actual vs Predicted Monetary: $y _i$ vs $\hat y_i$')
plt.xlabel('Actual Monetary $y _i$')
plt.ylabel('Prediced Monetary $\hat y _i$')
sns.regplot(x=y_test,y=preds,scatter_kws={"color":"indigo", "alpha":0.4})
plt.show()
In [ ]:
residuals = y_test - preds
plt.figure(dpi=200)
plt.title('Residuals vs Predicted Monetary')
plt.xlabel('Predicted Monetary $\hat y _i$')
plt.ylabel('Residuals')
sns.scatterplot(x=preds,y=residuals,c="indigo")
plt.show()
In [ ]:
resid_mean = round(residuals.mean(),2)
resid_skew = round(residuals.skew(),2)  # skew 是偏態
plt.figure(dpi=200)
sns.displot(residuals, kde=True,)
plt.title(f'Residuals Skew ({resid_skew}) Mean ({resid_mean})')
plt.show()
<Figure size 1280x960 with 0 Axes>

Prediction¶

In [ ]:
average_vals = train_dt.mean().values
property_stats = pd.DataFrame(data=average_vals.reshape(1, len(train_dt.columns)), 
                              columns=train_dt.columns)
pred_monetary = (train_model.predict(property_stats))[0]
print(f"Under average conditions, the monetary of that customer would be {pred_monetary:.2f}")
Under average conditions, the monetary of that customer would be 600.52
In [ ]:
property_stats["Response"] = 1
pred_monetary = (train_model.predict(property_stats))[0]
print(f"If a customer accepted the offer in the last campaign, the monetary of that customer would be {pred_monetary:.2f}")
If a customer accepted the offer in the last campaign, the monetary of that customer would be 638.70
In [ ]:
property_stats.loc[0,"AcceptedCmp1":"AcceptedCmp5"] = 0
property_stats["Response"] = 0
pred_monetary = (train_model.predict(property_stats))[0]
print(f"If a customer didn't accept any offer in the campaigns, the monetary of that customer would be {pred_monetary:.2f}")
If a customer didn't accept any offer in the campaigns, the monetary of that customer would be 555.70
In [ ]:
property_stats.loc[0,"AcceptedCmp1":"AcceptedCmp5"] = 1
property_stats["Response"] = 1
pred_monetary = (train_model.predict(property_stats))[0]
print(f"If a customer accepted all offers in the campaigns, the monetary of that customer would be {pred_monetary:.2f}")
If a customer accepted all offers in the campaigns, the monetary of that customer would be 1002.30

Thank you so much for taking the time to read this analysis.¶

If there's any suggestion, please feel free to leave your comments. And if you think this notebook is useful, you can fork or upvote it.